Author: Amanda Everitt
Began: 8/25/2018
Finished:: 8/27/2018
load(file=paste0(wd, "/outputs/output_03/neuronal_object.RData"))
experiment.aggregate@meta.data$group = "removed"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:2),]$group = "group1"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 == 3 ,]$group = "group2"
TSNEPlot(object = experiment.aggregate, group.by="orig.ident", pt.size=0.5)
TSNEPlot(object = experiment.aggregate, group.by="group", pt.size=0.5, colors.use = c("red","blue","grey"))
identified_cells <- rownames(experiment.aggregate@meta.data[experiment.aggregate@meta.data$group %in% c("group1","group2"),])
experiment.aggregate <- SubsetData(experiment.aggregate, cells.use = identified_cells, do.clean = T)
counts = as.matrix(experiment.aggregate@raw.data[, colnames(experiment.aggregate@data)])
norm_counts = as.matrix(experiment.aggregate@data)
cData = data.frame(nGene = as.integer(experiment.aggregate@meta.data$nGene),
nUMI = as.integer(experiment.aggregate@meta.data$nUMI),
GT = gsub(".*L5|/", "", rownames(experiment.aggregate@meta.data)),
group = experiment.aggregate@meta.data$group
)
rownames(cData) = colnames(counts)
#Row Data
means <- rowMeans(as.matrix(experiment.aggregate@data))
vars <- rowVars(as.matrix(experiment.aggregate@data))
rData = data.frame(NumberCellsOccurIn = as.numeric(rowSums(as.matrix(experiment.aggregate@data) > 0)),
MeanExpr = as.numeric(means),
Var = as.numeric(vars),
CoefofVar = vars/means^2
)
rownames(rData) = rownames(counts)
core = SingleCellExperiment(assays = list(counts = counts), colData = cData, rowData = rData)
core_norm = SingleCellExperiment(assays = list(counts = norm_counts), colData = cData, rowData = rData)
#not evaluating here so file isn't constantly overwritten
save(core, file=paste0(out_dir, "/SC_core.RData"))
save(core_norm, file=paste0(out_dir, "/SC_core_norm.RData"))
#!/bin/bash
#install R
yum -y remove gcc72-c++.x86_64 libgcc72.x86_64
yum -y install gcc-gfortran.noarch
yum-builddep -y R
yum -y install gcc-c++
yum -y install curl-devel
yum -y install libxml2-devel
yum -y install gsl-devel
yum -y install openssl-devel
yum -y install libpng-devel
yum -y install java-1.8.0-openjdk-src.x86_64
ln -s /usr/lib/gcc/x86_64-amazon-linux/6.4.1/libquadmath.so /usr/lib/libquadmath.so
ln -s /usr/lib/gcc/x86_64-amazon-linux/6.4.1/libgfortran.so /usr/lib/libgfortran.so
yum -y install git
wget https://cran.r-project.org/src/base/R-3/R-3.5.1.tar.gz
tar -xzf R-3.5.1.tar.gz
rm -rf R-3.5.1.tar.gz
mkdir /progs
cd R-3.5.1/
./configure --prefix=/progs/3.5.1 --enable-R-shlib
make
make install
cd
ln -s /progs/3.5.1/bin/R /usr/bin/
#install RStudio-Server 1.0
wget https://download2.rstudio.org/rstudio-server-rhel-1.0.153-x86_64.rpm
yum install -y --nogpgcheck rstudio-server-rhel-1.0.153-x86_64.rpm
rm -rf rstudio-server-rhel-1.0.153-x86_64.rpm
sudo su
rstudio-server verify-installation
adduser r_user
passwd r_user #follow prompts
rstudio-server start
#Install notebook packages as you wait for below to finish bc that will take awhile
install.packages("BiocManager")
BiocManager::install(version = '3.8')
BiocManager::install("zinbwave")
BiocManager::install("DESeq2")
BiocManager::install(c("dplyr", "doParallel", "MAST"))
#FYI Seurat won't load bc dependency hdf5r can't load. yum install not available for this version and I don't feel like doing it from source -- will just do seurat part on local
runSeurat <- function(out_dir, my.method) {
suppressPackageStartupMessages(require(Seurat))
load(file=paste0(wd, "/outputs/output_03/neuronal_object.RData"))
experiment.aggregate@meta.data$group = "removed"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:2),]$group = "group1"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 == 3 ,]$group = "group2"
experiment.aggregate <- SetAllIdent(experiment.aggregate, id = "group")
group1.v.group2 <- FindMarkers(object = experiment.aggregate,
ident.1 = "group1",
ident.2 = "group2",
logfc.threshold = 0,
genes.use = rownames(experiment.aggregate@raw.data), #ALL data
thresh.use = 0, test.use = my.method,
min.pct = 0, random.seed = 1,
max.cells.per.ident = Inf)
group1.v.group2$method = 'seurat'
group1.v.group2$gene <- rownames(group1.v.group2)
group1.v.group2 = group1.v.group2[, c('gene', 'p_val', 'p_val_adj', 'avg_logFC', 'method')]
colnames(group1.v.group2) = c("gene", "pval", "padj", "logfc", "method") #make column names consistent across all
write.table(group1.v.group2, file=paste0(out_dir, "/seurat_", my.method,".csv"))
}
runSeurat(out_dir= paste0(out_dir,"/programs/"), my.method="wilcox")
runSeurat(out_dir= paste0(out_dir,"/programs/"), my.method="negbinom")
#wants raw data
fit_edgeR <- function(counts, design, filter = NULL){
suppressPackageStartupMessages(library(edgeR))
d = DGEList(counts)
d = suppressWarnings(calcNormFactors(d)) #Normalize by Effective library size
d = estimateDisp(d, design) #will take awhile
jpeg("EdgeR_bcv.jpeg")
plotBCV(d, cex.main=0.8,main = paste("Estimated biological common sq-rt-dispersion is:",
round(sqrt(d$common.dispersion), digits=2) ,
"\n(should be below 0.4 for human datasets arising from well-controlled experiments)"))
dev.off()
fit = glmFit(d, design)
head(fit$coefficients)
glm_1v2 <- glmLRT(fit, coef="colData(core)$groupgroup2") #Preform likelihood ratio tests between full and reduced models
summary(decideTests(glm_1v2))
tab_1v2 = glm_1v2$table
tab_1v2$padj = p.adjust(tab_1v2$PValue, "BH")
tab_1v2$gene = rownames(tab_1v2)
de_1v2 <- as.data.frame(tab_1v2, stringsAsFactors = FALSE)
de_1v2 = de_1v2[, c('gene', 'PValue', 'padj', 'logFC')] #rearrange
colnames(de_1v2) = c('gene', 'pval', 'padj', 'logfc') #relabel
de_1v2$method <- 'edgeR'
write.table(de_1v2, file="edgeR.csv")
}
load("/home/r_user/SC_core.RData")
colData(core)$group <- relevel(colData(core)$group, ref = "group1")
fit_edgeR(assay(core), model.matrix(~colData(core)$group))
knitr::include_graphics(paste0(out_dir, '/programs/EdgeR_bcv.jpeg'))
#wants raw data
runLimmavoom <- function(counts, design) {
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(edgeR))
dgel <- DGEList(counts)
dgel <- edgeR::calcNormFactors(dgel) #Use norm factors from edgeR
v <- voom(dgel,design,plot=FALSE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
tt_1v2 <- topTable(fit,coef="colData(core)$groupgroup2",n=nrow(dgel),sort.by="none")
padj_1v2 <- p.adjust(tt_1v2$P.Value,method="BH")
padj_1v2[is.na(padj_1v2)] <- 1
voom_1v2 <- data.frame(gene = rownames(tt_1v2), pval=tt_1v2$P.Value, padj=padj_1v2, logfc=tt_1v2$logFC, method = "limmavoom")
write.table(voom_1v2, file="voom.csv")
}
load("/home/r_user/SC_core.RData")
colData(core)$group <- relevel(colData(core)$group, ref = "group1")
runLimmavoom(assay(core), model.matrix(~colData(core)$group))
#wants raw data
run_Monocle <- function(cData, rData, counts){
suppressPackageStartupMessages(library("monocle"))
#Requires CellDataSet object and un-normalized counts
#cData$GT <- relevel(cData$GT, ref = "WT")
pd <- new("AnnotatedDataFrame", data = cData)
fd <- new("AnnotatedDataFrame", data = rData)
fd$gene_short_name <- rownames(fd)
HSMM <- newCellDataSet(as(counts, "sparseMatrix"),
phenoData = pd,
featureData = fd,
expressionFamily=negbinomial.size())
HSMM <- estimateSizeFactors(HSMM)
HSMM <- estimateDispersions(HSMM)
res_1v2 <- differentialGeneTest(HSMM, verbose=T, cores=10,
fullModelFormulaStr = "~group"
)
res_1v2 <- res_1v2[, c("pval","qval","gene_short_name")]
colnames(res_1v2) <- c("pval","padj","gene")
res_1v2$method <- "monocle"
res_1v2$logfc <- 0
res_1v2 <- res_1v2[, c('gene', 'pval', 'padj', 'logfc','method')]
write.table(res_1v2, file="monocle.csv")
disp_table <- dispersionTable(HSMM)
unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)
HSMM <- setOrderingFilter(HSMM, unsup_clustering_genes$gene_id)
jpeg("monocle2_bcv.jpeg")
plot_ordering_genes(HSMM)
dev.off()
}
load("/home/r_user/SC_core.RData")
colData(core)$group <- relevel(colData(core)$group, ref = "group1")
cData = data.frame(colData(core))
rData = data.frame(rowData(core))
rownames(rData) = rownames(core)
run_Monocle(cData, rData, as.matrix(assay(core)))
knitr::include_graphics(paste0(out_dir, '/programs/monocle2_bcv.jpeg'))
#wants tpm
runMAST <- function(counts, design, cData) {
suppressPackageStartupMessages(library(MAST))
tpm <- counts*1e6/colSums(counts)
tpm <- log2(tpm+1)
sca <- FromMatrix(tpm,cData)
assays(sca) <- list(tpm=assay(sca))
ngeneson <- apply(counts,2,function(x) mean(x>0)) #mean of counts for each cell where gene was expressed
CD <- colData(sca)
CD$ngeneson <- ngeneson #save into a coldata copy
CD$cngeneson <- CD$ngeneson-mean(ngeneson) #mean of cell-mean of all cells
colData(sca) <- CD #add two variables to column data
## differential expression
fit <- zlm(~ cngeneson + group, sca = sca,
method = "bayesglm", ebayes = TRUE)
summaryDt = summary(fit, doLRT='groupgroup2')
summaryDt = summaryDt$datatable
fcHurdle <- merge(summaryDt[contrast=='groupgroup2'&component=='H',.(primerid, `Pr(>Chisq)`)],
summaryDt[contrast=='groupgroup2' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid')
fcHurdle[,padj:=p.adjust(`Pr(>Chisq)`, 'fdr')]
df_1v2 = data.frame(gene = fcHurdle$primerid,
pval=fcHurdle[,'Pr(>Chisq)'], padj=fcHurdle$padj,
logfc=fcHurdle$coef, method="MAST")
colnames(df_1v2)[2] = 'pval'
write.table(df_1v2, file=paste0(out_dir, '/programs/MAST.csv'))
}
load("/home/r_user/SC_core.RData")
colData(core)$group <- relevel(colData(core)$group, ref = "group1")
runMAST(assay(core), model.matrix(~colData(core)$group), core@colData)
runMAST_thresh <- function(counts, design, cData, my.bins =20 , my.min.per.bin =30, outputfile) {
suppressPackageStartupMessages(library(MAST))
tpm <- counts*1e6/colSums(counts)
tpm <- log2(tpm+1)
sca <- FromMatrix(tpm,cData)
assays(sca) <- list(tpm=assay(sca))
ngeneson <- apply(counts,2,function(x) mean(x>0)) #mean of counts for each cell where gene was expressed
CD <- colData(sca)
CD$ngeneson <- ngeneson #save into a coldata copy
CD$cngeneson <- CD$ngeneson-mean(ngeneson) #mean of cell-mean of all cells
colData(sca) <- CD #add two variables to column data
thres <- thresholdSCRNACountMatrix(assay(sca), conditions = sca@colData$group, nbins = my.bins, min_per_bin = my.min.per.bin)
assays(sca) <- list(thresh=thres$counts_threshold, tpm=assay(sca))
jpeg(paste0(out_dir, '/programs/Mast_thresh.jpeg'))
plot(thres)
dev.off()
fit <- zlm(~ cngeneson + group, sca = sca,
method = "bayesglm", ebayes = TRUE)
summaryDt = summary(fit, doLRT='groupgroup2')
summaryDt = summaryDt$datatable
fcHurdle <- merge(summaryDt[contrast=='groupgroup2'&component=='H',.(primerid, `Pr(>Chisq)`)],
summaryDt[contrast=='groupgroup2' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid')
fcHurdle[,padj:=p.adjust(`Pr(>Chisq)`, 'fdr')]
df_1v2 = data.frame(gene = fcHurdle$primerid,
pval=fcHurdle[,'Pr(>Chisq)'], padj=fcHurdle$padj,
logfc=fcHurdle$coef, method="MAST")
colnames(df_1v2)[2] = 'pval'
write.table(df_1v2, file=outputfile)
}
load("/home/r_user/SC_core.RData")
colData(core)$group <- relevel(colData(core)$group, ref = "group1")
runMAST_thresh(counts = assay(core),
design = model.matrix(~colData(core)$group),
cData = core@colData,
my.bins = 40,
my.min.per.bin = 10,
outputfile = paste0(out_dir, '/programs/MAST_thresh.csv'))
knitr::include_graphics(paste0(out_dir, '/programs/Mast_thresh.jpeg'))
#wants raw data
runROTS <- function(core){
#BiocManager::install("ROTS")
suppressPackageStartupMessages(library(ROTS))
counts = assay(core)
colData(core)$group <- relevel(colData(core)$group, ref = "group1")
if (all.equal(colnames(counts), rownames(colData(core)))){
groups = as.numeric(colData(core)$group)
table(groups)
}
results = ROTS(data = counts, groups = groups , B = 1000 , K = 500 , seed = 1234, progress=TRUE, log=TRUE)
res <- data.frame(gene=names(results$logfc), pval=results$pvalue, padj=results$FDR, logfc=results$logfc, method="ROTS")
write.table(res, file="ROTS.csv")
}
load("/home/r_user/SC_core.RData")
runROTS(core)
load(paste0(wd, "/outputs/output_03/neuronal_object.RData"))
experiment.aggregate@meta.data$group = "removed"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:2),]$group = "group1"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 == 3 ,]$group = "group2"
TSNEPlot(object = experiment.aggregate, group.by="group", pt.size=0.5, colors.use = c("red","blue","grey"))
#Accidentally put the wrong reference level in above. switching here and re-saving
comp1 <- read.table("/Users/AEveritt/projects/scRNAseq_L5_Tbr1/output_04/programs/MAST_thresh.csv")
comp1$logfc <- (comp1$logfc * -1)
comp1 <- comp1[, c("gene","pval","padj","logfc")]
colnames(comp1) <- c("gene","pval","fdr","logfc")
write.csv(comp1, file=paste0(out_dir, "/Group2vsGroup1.csv"))
load(paste0(wd, "/outputs/output_03/neuronal_object.RData"))
experiment.aggregate@meta.data$group = "removed"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:3)
& experiment.aggregate@meta.data$orig.ident == "WT",]$group = "WT"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:3)
& experiment.aggregate@meta.data$orig.ident == "NULL",]$group = "NULL"
TSNEPlot(object = experiment.aggregate, group.by="group", pt.size=0.5, colors.use = c("red","grey","blue"))
counts = as.matrix(experiment.aggregate@raw.data[, rownames(experiment.aggregate@meta.data[experiment.aggregate@meta.data$group != "removed",])])
cData = experiment.aggregate@meta.data[experiment.aggregate@meta.data$group != "removed",]
table(cData$group)
##
## NULL WT
## 3935 1778
if (all.equal(rownames(cData), colnames(counts))){
cData$group <- as.factor(cData$group)
cData$group <- relevel(cData$group, ref = "WT")
runMAST_thresh(counts,
cData$group,
cData,
my.bins = 40,
my.min.per.bin = 10,
ref.name = "groupNULL",
outnames = '/WTvsNULL')
}
load(paste0(wd, "/outputs/output_03/neuronal_object.RData"))
experiment.aggregate@meta.data$group = "removed"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:3)
& experiment.aggregate@meta.data$orig.ident == "WT",]$group = "WT"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:3)
& experiment.aggregate@meta.data$orig.ident == "HET",]$group = "HET"
TSNEPlot(object = experiment.aggregate, group.by="group", pt.size=0.5, colors.use = c("red","grey","blue"))
counts = as.matrix(experiment.aggregate@raw.data[, rownames(experiment.aggregate@meta.data[experiment.aggregate@meta.data$group != "removed",])])
cData = experiment.aggregate@meta.data[experiment.aggregate@meta.data$group != "removed",]
table(cData$group)
##
## HET WT
## 5357 1778
if (all.equal(rownames(cData), colnames(counts))){
cData$group <- as.factor(cData$group)
cData$group <- relevel(cData$group, ref = "WT")
runMAST_thresh(counts,
cData$group,
cData,
my.bins = 40,
my.min.per.bin = 10,
ref.name = "groupHET",
outnames = '/WTvsHET')
}
load(paste0(wd, "/outputs/output_03/neuronal_object.RData"))
experiment.aggregate@meta.data$Tbr1pos <- ifelse(experiment.aggregate@data["Tbr1",] > 0, "yes","no")
experiment.aggregate@meta.data$group = "removed"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:3)
& experiment.aggregate@meta.data$orig.ident == "WT"
& experiment.aggregate@meta.data$Tbr1pos == "yes",]$group = "WT"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:3)
& experiment.aggregate@meta.data$orig.ident == "NULL"
& experiment.aggregate@meta.data$Tbr1pos == "yes" ,]$group = "NULL"
TSNEPlot(object = experiment.aggregate, group.by="group", pt.size=1.5, colors.use = c("red","grey","blue"))
counts = as.matrix(experiment.aggregate@raw.data[, rownames(experiment.aggregate@meta.data[experiment.aggregate@meta.data$group != "removed",])])
cData = experiment.aggregate@meta.data[experiment.aggregate@meta.data$group != "removed",]
table(cData$group)
##
## NULL WT
## 132 96
if (all.equal(rownames(cData), colnames(counts))){
cData$group <- as.factor(cData$group)
cData$group <- relevel(cData$group, ref = "WT")
runMAST_thresh(counts,
cData$group,
cData,
my.bins = 30,
my.min.per.bin = 10,
ref.name = "groupNULL",
outnames = '/Tbr1_WTvsTbr1_NULL')
}
load(paste0(wd, "/outputs/output_03/neuronal_object.RData"))
experiment.aggregate@meta.data$Tbr1pos <- ifelse(experiment.aggregate@data["Tbr1",] > 0, "yes","no")
experiment.aggregate@meta.data$group = "removed"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:3)
& experiment.aggregate@meta.data$orig.ident == "WT"
& experiment.aggregate@meta.data$Tbr1pos == "yes",]$group = "WT"
experiment.aggregate@meta.data[experiment.aggregate@meta.data$res.0.3 %in% c(0:3)
& experiment.aggregate@meta.data$orig.ident == "HET"
& experiment.aggregate@meta.data$Tbr1pos == "yes" ,]$group = "HET"
TSNEPlot(object = experiment.aggregate, group.by="group", pt.size=1.5, colors.use = c("red","grey","blue"))
counts = as.matrix(experiment.aggregate@raw.data[, rownames(experiment.aggregate@meta.data[experiment.aggregate@meta.data$group != "removed",])])
cData = experiment.aggregate@meta.data[experiment.aggregate@meta.data$group != "removed",]
table(cData$group)
##
## HET WT
## 302 96
if (all.equal(rownames(cData), colnames(counts))){
cData$group <- as.factor(cData$group)
cData$group <- relevel(cData$group, ref = "WT")
runMAST_thresh(counts,
cData$group,
cData,
my.bins = 30,
my.min.per.bin = 10,
ref.name = "groupHET",
outnames = '/Tbr1_WTvsTbr1_HET')
}